library(tidyverse)
library(lubridate)
library(plotly)
library(skimr)
theme_set(theme_minimal())df_case <- read.csv("../data/case_cost_prediction_train.csv")
df_case <- df_case %>%
mutate(tglpelayanan = as.Date(tglpelayanan))
df_case <- df_case %>%
arrange(kddati2, tkp, tglpelayanan)
# Validation data
df_case_val <- read.csv("../data/case_cost_prediction_val.csv")
df_case_val <- df_case_val %>%
mutate(tglpelayanan = as.Date(tglpelayanan))
df_case_val <- df_case_val %>%
arrange(kddati2, tkp, tglpelayanan)
# Combine
df_case_all <- bind_rows(
mutate(df_case, cat = "Train"),
mutate(df_case_val, cat = "Test")
) %>%
arrange(kddati2, tkp, tglpelayanan)
# Small features engineering
df_case <- df_case %>%
mutate(m = month(tglpelayanan),
y = year(tglpelayanan)) %>%
mutate(comb = paste0(a, b, c, cb, d, ds, gd, hd, i1, i2, i3, i4, kb, kc, kg, ki, kj, kk, kl, km, ko, kp, kt, ku,
s, sa, sb, sc, sd))
df_monthly <- df_case %>%
group_by(tglpelayanan, tkp) %>%
summarise(total_case = sum(case),
total_cost = sum(unit_cost * case),
avg_cost = total_cost / total_case,
kd = n_distinct(kddati2),
case_per_kd = total_case / kd,
cost_per_kd = total_cost / kd)
# KD level
df_kd <- df_case %>%
group_by(kddati2, tglpelayanan, tkp, comb) %>%
summarise(cnt = n(),
peserta = sum(peserta),
case = sum(case),
case_ratio = case / peserta,
unit_cost = sum(unit_cost),
cost_per_case = unit_cost / case)df_case_all <- df_case_all %>%
# rowwise() %>%
mutate(comb_sum = a+b+c+cb+d+ds+gd+hd+i1+i2+i3+i4+kb+kc+kg+ki+kj+kk+kl+km+ko+kp+kt+ku+s+sa+sb+sc+sd)Validation
df_case_val %>%
count(kddati2)Not existed in testing
df_case %>%
count(kddati2, tkp) %>%
rename(n_train = n) %>%
anti_join(
df_case_val %>%
count(kddati2, tkp) %>%
rename(n_test = n)
)df_case_all %>%
count(kddati2, tglpelayanan, peserta) %>%
filter(n != 2)How many dates?
p <- df_case %>%
count(tglpelayanan) %>%
ggplot(aes(tglpelayanan, n)) +
geom_line()
ggplotly(p)How many cases per date?
p <- df_monthly %>%
ggplot(aes(tglpelayanan, total_case, color = factor(tkp))) +
geom_line()
ggplotly(p)Unit cost?
p <- df_monthly %>%
filter(tkp == 30, tglpelayanan < ymd(20210601)) %>%
ggplot(aes(tglpelayanan, avg_cost, color = factor(tkp))) +
geom_line()
ggplotly(p)p <- df_monthly %>%
filter(tkp == 40, tglpelayanan < ymd(20210601)) %>%
ggplot(aes(tglpelayanan, avg_cost, color = factor(tkp))) +
geom_line()
ggplotly(p)df_case %>%
filter(tkp == 30) %>%
select(row_id, tglpelayanan, kddati2) %>%
mutate(cat = "Train") %>%
bind_rows(
df_case_val %>%
filter(tkp == 30) %>%
select(row_id, tglpelayanan, kddati2) %>%
mutate(cat = "Test")
) %>%
mutate(kddati2 = as.factor(kddati2)) %>%
ggplot(aes(kddati2, tglpelayanan, fill = cat)) +
coord_flip(expand = c(0,0)) +
geom_tile() +
theme_minimal()df_case_all %>%
filter(tkp == 30) %>%
group_by(tglpelayanan, cat) %>%
summarise(cnt = n()) %>%
ggplot(aes(tglpelayanan, cnt, fill = cat)) +
geom_col() +
theme_minimal()df_casecor_df <- df_case_all %>%
group_by(kddati2, tkp) %>%
# mutate_at(vars(a, b, c, cb, d, ds, gd, hd, i1, i2, i3, i4, kb, kc, kg, ki, kj, kk, kl, km, ko, kp, kt, ku,
# s, sa, sb, sc, sd, comb_sum, case),
# funs(diff = . - lag(.))) %>%
mutate_at(vars(a, b, c, cb, d, ds, gd, hd, i1, i2, i3, i4, kb, kc, kg, ki, kj, kk, kl, km, ko, kp, kt, ku,
s, sa, sb, sc, sd, comb_sum, case, unit_cost),
funs(lag = lag(.),
lead = lead(.))) %>%
ungroup() %>%
select(-ds) %>%
filter(cat == "Train") %>%
select_if(is.numeric) %>%
corrr::correlate() %>%
corrr::focus(case, unit_cost) %>%
arrange(desc(abs(case)))
cor_dfcor_df %>%
arrange(desc(abs(unit_cost)))df_case %>%
select(peserta, case, unit_cost) %>%
corrr::correlate()df_case_wider <- df_case %>%
select(tglpelayanan, kddati2, tkp, peserta, case, unit_cost) %>%
pivot_wider(id_cols = c(tglpelayanan, kddati2),
names_from = tkp, values_from = c(peserta, case, unit_cost)) %>%
arrange(kddati2, tglpelayanan) %>%
mutate(peserta = coalesce(peserta_30, peserta_40)) %>%
mutate_at(vars(case_30, case_40, unit_cost_30, unit_cost_40, peserta),
funs(lag1 = lag(., 1),
lag2 = lag(., 2),
lag3 = lag(., 3),
lead1 = lead(., 1),
lead2 = lead(., 2),
lead3 = lead(., 3)))
# all same
# df_case_wider %>%
# filter(!is.na(peserta_30), !is.na(peserta_40)) %>%
# filter(peserta_30 != peserta_40) %>%
# View()
# 20,206
df_case_wider %>%
select(peserta, case_30, case_40, unit_cost_30, unit_cost_40) %>%
drop_na() %>%
corrr::correlate()# 10,802
df_case_wider %>%
select(peserta, case_30, case_40, unit_cost_30, unit_cost_40, contains("lag1")) %>%
drop_na() %>%
corrr::correlate() %>%
corrr::stretch(na.rm = TRUE, remove.dups = TRUE) %>%
arrange(desc(r))# 10,802
df_case_wider %>%
select(peserta, case_30, case_40, unit_cost_30, unit_cost_40, contains("lead1")) %>%
drop_na() %>%
corrr::correlate() %>%
corrr::stretch(na.rm = TRUE, remove.dups = TRUE) %>%
filter(x %in% c("case_30", "case_40", "unit_cost_30", "unit_cost_40") |
y %in% c("case_30", "case_40", "unit_cost_30", "unit_cost_40")) %>%
arrange(desc(r))# 5,755
df_case_wider %>%
select(peserta, case_30, case_40, unit_cost_30, unit_cost_40, contains("lead1"), contains("lag1")) %>%
drop_na() %>%
corrr::correlate() %>%
corrr::stretch(na.rm = TRUE, remove.dups = TRUE) %>%
filter(x %in% c("case_30", "case_40", "unit_cost_30", "unit_cost_40") |
y %in% c("case_30", "case_40", "unit_cost_30", "unit_cost_40")) %>%
arrange(desc(r))# MAPE >= 200%?
df_case %>%
mutate(case_mean = mean(case),
case_sd = sd(case))viz_kd <- function(id) {
res <- df_case_all %>%
filter(kddati2 == id)
res %>%
group_by(tglpelayanan, metrics = "peserta") %>%
summarise(val = mean(peserta, na.rm = TRUE)) %>%
ungroup() %>%
bind_rows(
res %>%
filter(tkp == 30) %>%
transmute(tglpelayanan, metrics = "case_30", val = case, cat),
res %>%
filter(tkp == 40) %>%
transmute(tglpelayanan, metrics = "case_40", val = case, cat)
) %>%
ggplot(aes(tglpelayanan, val, fill = metrics)) +
geom_line() +
facet_wrap(~metrics, scales = "free_y", nrow = 3) +
labs(subtitle = id)
}
viz_kd(300)viz_kd(50)df_pred <- df_case %>%
group_by(kddati2) %>%
mutate(predict_case = median(case),
predict_unit_cost = median(unit_cost)) %>%
ungroup() %>%
select(row_id, case, predict_case, unit_cost, predict_unit_cost)
set.seed(100)
case_random <- runif(nrow(df_case), 0.85, 1.15)
set.seed(1000)
unit_cost_random <- runif(nrow(df_case), 0.95, 1.07)
df_pred <- df_case %>%
mutate(case_random_ = case_random,
unit_cost_random_ = unit_cost_random) %>%
mutate(predict_case = case * case_random_,
predict_unit_cost = unit_cost * unit_cost_random) %>%
select(row_id, case, predict_case, unit_cost, predict_unit_cost)
mape <- function(y_actual, y_pred) {
mean(abs( (y_actual - y_pred) / y_actual ))
}
mae <- function(y_actual, y_pred) {
mean(abs( (y_actual - y_pred) ))
}
# MAPE
cat("MAPE Cost", mape(df_pred$unit_cost, df_pred$predict_unit_cost), "\n")## MAPE Cost 0.03080327
cat("MAPE Case", mape(df_pred$case, df_pred$predict_case), "\n")## MAPE Case 0.07498841
# MAE
cat("MAPE Cost", mae(df_pred$unit_cost, df_pred$predict_unit_cost), "\n")## MAPE Cost 60476.69
cat("MAPE Case", mae(df_pred$case, df_pred$predict_case), "\n")## MAPE Case 489.5009
Baseline average previous and after value
df_pred <- df_case %>%
group_by(kddati2, tkp) %>%
mutate(predict_case = (coalesce(lag(case), lead(case)) + coalesce(lead(case), lag(case))) / 2,
predict_unit_cost = (coalesce(lag(unit_cost), lead(unit_cost)) + coalesce(lead(unit_cost), lag(unit_cost))) / 2) %>%
ungroup() %>%
select(row_id, case, predict_case, unit_cost, predict_unit_cost) %>%
filter(!is.na(predict_case), !is.na(predict_unit_cost))
# MAPE
cat("MAPE Cost", mape(df_pred$unit_cost, df_pred$predict_unit_cost), "\n")## MAPE Cost 0.04524511
cat("MAPE Case", mape(df_pred$case, df_pred$predict_case), "\n")## MAPE Case 1.90452
# MAE
cat("MAE Cost", mae(df_pred$unit_cost, df_pred$predict_unit_cost), "\n")## MAE Cost 77977.58
cat("MAE Case", mae(df_pred$case, df_pred$predict_case), "\n")## MAE Case 641.8952
Lead
df_pred <- df_case %>%
group_by(kddati2, tkp) %>%
mutate(predict_case = lead(case),
predict_unit_cost = lead(unit_cost)) %>%
ungroup() %>%
select(tglpelayanan, row_id, case, predict_case, unit_cost, predict_unit_cost) %>%
filter(!is.na(predict_case), !is.na(predict_unit_cost))
# MAPE
cat("MAPE Cost", mape(df_pred$unit_cost, df_pred$predict_unit_cost), "\n")## MAPE Cost 0.05404623
cat("MAPE Case", mape(df_pred$case, df_pred$predict_case), "\n")## MAPE Case 0.2773798
# MAE
cat("MAE Cost", mae(df_pred$unit_cost, df_pred$predict_unit_cost), "\n")## MAE Cost 92263.18
cat("MAE Case", mae(df_pred$case, df_pred$predict_case), "\n")## MAE Case 753.4279
Use data test too
df_pred <- df_case_all %>%
group_by(kddati2, tkp) %>%
mutate(predict_case = (coalesce(lag(case), lead(case)) + coalesce(lead(case), lag(case))) / 2,
predict_unit_cost = (coalesce(lag(unit_cost), lead(unit_cost)) + coalesce(lead(unit_cost), lag(unit_cost))) / 2) %>%
ungroup() %>%
select(tglpelayanan, row_id, case, predict_case, unit_cost, predict_unit_cost, cat) %>%
filter(!is.na(predict_case), !is.na(predict_unit_cost), cat == "Train")
# MAPE
cat("MAPE Cost", mape(df_pred$unit_cost, df_pred$predict_unit_cost), "\n")## MAPE Cost 0.04660915
cat("MAPE Case", mape(df_pred$case, df_pred$predict_case), "\n")## MAPE Case 1.667445
# MAE
cat("MAE Cost", mae(df_pred$unit_cost, df_pred$predict_unit_cost), "\n")## MAE Cost 79258.34
cat("MAE Case", mae(df_pred$case, df_pred$predict_case), "\n")## MAE Case 655.3304
df_pred <- df_case_all %>%
group_by(kddati2, tkp) %>%
mutate(predict_case = (coalesce(lag(case), lead(case)) + coalesce(lead(case), lag(case))) / 2,
predict_unit_cost = (coalesce(lag(unit_cost), lead(unit_cost)) + coalesce(lead(unit_cost), lag(unit_cost))) / 2) %>%
ungroup() %>%
select(row_id, case, predict_case, unit_cost, predict_unit_cost, cat) %>%
filter(cat == "Test")df_pred <- df_pred %>%
mutate(pe = abs( (unit_cost - predict_unit_cost) / unit_cost ) )
df_pred %>%
summary()## row_id case predict_case unit_cost
## Min. : 1 Min. : NA Min. : 1.0 Min. : NA
## 1st Qu.: 6159 1st Qu.: NA 1st Qu.: 436.2 1st Qu.: NA
## Median :12317 Median : NA Median : 1409.0 Median : NA
## Mean :12317 Mean :NaN Mean : 6485.8 Mean :NaN
## 3rd Qu.:18475 3rd Qu.: NA 3rd Qu.: 4574.0 3rd Qu.: NA
## Max. :24633 Max. : NA Max. :319056.0 Max. : NA
## NA's :24633 NA's :2326 NA's :24633
## predict_unit_cost cat pe
## Min. : 115621 Length:24633 Min. : NA
## 1st Qu.: 234019 Class :character 1st Qu.: NA
## Median : 1689109 Mode :character Median : NA
## Mean : 1975835 Mean :NaN
## 3rd Qu.: 3544800 3rd Qu.: NA
## Max. :26905500 Max. : NA
## NA's :2326 NA's :24633
Check data completeness
test_fe <- df_case_val %>%
select(row_id, tglpelayanan, kddati2, tkp, peserta) %>%
mutate(cat = "Test") %>%
bind_rows(
df_case %>%
select(row_id, tglpelayanan, kddati2, tkp, peserta, case, unit_cost) %>%
mutate(cat = "Train")
) %>%
arrange(kddati2, tkp, tglpelayanan) %>%
group_by(kddati2, tkp) %>%
mutate_at(vars(case, peserta),
funs(lag1 = lag(., 1),
lag2 = lag(., 2),
lag3 = lag(., 3),
lag6 = lag(., 6),
lead1 = lead(., 1),
lead2 = lead(., 2),
lead3 = lead(., 3),
lead6 = lead(., 6),)) %>%
ungroup()
test_fe %>%
group_by(cat) %>%
summarise_if(is.numeric, ~sum(ifelse(!is.na(.), 1, 0))) %>%
gather(key, val, -cat) %>%
group_by(cat) %>%
mutate(val_max = max(val)) %>%
filter(grepl("lag|lead", key)) %>%
mutate(val_pct = val / val_max)Check submission results
submission <- read.csv("../submission/tahap2_case_cost_prediction.csv") %>%
mutate(cat = "Test")
df_case_final <- df_case_all %>%
left_join(submission) %>%
mutate(case = coalesce(case, predict_case),
unit_cost = coalesce(unit_cost, predict_unit_cost))
df_case_final %>%
filter(cat == "Test") %>%
select(predict_case, predict_unit_cost) %>%
summary()## predict_case predict_unit_cost
## Min. : -114.7 Min. : 130130
## 1st Qu.: 422.8 1st Qu.: 236190
## Median : 1386.0 Median : 1729663
## Mean : 6319.7 Mean : 1972586
## 3rd Qu.: 4394.3 3rd Qu.: 3545424
## Max. :172498.8 Max. :11258508
viz_kd <- function(id) {
res <- df_case_final %>%
filter(kddati2 == id)
res %>%
group_by(tglpelayanan, metrics = "peserta") %>%
summarise(val = mean(peserta, na.rm = TRUE)) %>%
ungroup() %>%
bind_rows(
res %>%
filter(tkp == 30) %>%
transmute(tglpelayanan, metrics = "case_30", val = case, cat),
res %>%
filter(tkp == 40) %>%
transmute(tglpelayanan, metrics = "case_40", val = case, cat)
) %>%
ggplot(aes(tglpelayanan, val, fill = metrics)) +
geom_line() +
geom_point(aes(tglpelayanan, val, color = cat)) +
facet_wrap(~metrics, scales = "free_y", nrow = 3) +
labs(subtitle = id)
}
viz_kd(5)submission <- read.csv("../submission/tahap2_case_cost_prediction_V2.csv") %>%
mutate(cat = "Test")
df_case_final <- df_case_all %>%
left_join(submission) %>%
mutate(case = coalesce(case, predict_case),
unit_cost = coalesce(unit_cost, predict_unit_cost))
df_case_final %>%
filter(cat == "Test") %>%
select(predict_case, predict_unit_cost) %>%
summary()## predict_case predict_unit_cost
## Min. : 1.0 Min. : 115621
## 1st Qu.: 421.3 1st Qu.: 234319
## Median : 1379.3 Median : 1700871
## Mean : 6226.3 Mean : 1972148
## 3rd Qu.: 4440.6 3rd Qu.: 3544692
## Max. :143554.5 Max. :16992385
viz_kd(5)sessionInfo()## R version 4.0.5 (2021-03-31)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] skimr_2.1.3 plotly_4.9.3 lubridate_1.7.10 forcats_0.5.1
## [5] stringr_1.4.0 dplyr_1.0.5 purrr_0.3.4 readr_1.4.0
## [9] tidyr_1.1.3 tibble_3.1.0 ggplot2_3.3.3 tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.6 assertthat_0.2.1 digest_0.6.27 utf8_1.2.1
## [5] R6_2.5.0 cellranger_1.1.0 repr_1.1.3 backports_1.2.1
## [9] reprex_2.0.0 evaluate_0.14 highr_0.8 httr_1.4.2
## [13] pillar_1.5.1 rlang_0.4.10 lazyeval_0.2.2 readxl_1.3.1
## [17] rstudioapi_0.13 data.table_1.14.0 rmarkdown_2.7 labeling_0.4.2
## [21] htmlwidgets_1.5.3 munsell_0.5.0 broom_0.7.6 compiler_4.0.5
## [25] modelr_0.1.8 xfun_0.22 pkgconfig_2.0.3 base64enc_0.1-3
## [29] htmltools_0.5.1.1 tidyselect_1.1.0 corrr_0.4.3 fansi_0.4.2
## [33] viridisLite_0.3.0 crayon_1.4.1 dbplyr_2.1.1 withr_2.4.1
## [37] grid_4.0.5 jsonlite_1.7.2 gtable_0.3.0 lifecycle_1.0.0
## [41] DBI_1.1.1 magrittr_2.0.1 scales_1.1.1 cli_2.4.0
## [45] stringi_1.5.3 farver_2.1.0 fs_1.5.0 xml2_1.3.2
## [49] ellipsis_0.3.1 generics_0.1.0 vctrs_0.3.7 tools_4.0.5
## [53] glue_1.4.2 hms_1.0.0 crosstalk_1.1.1 yaml_2.2.1
## [57] colorspace_2.0-0 rvest_1.0.0 knitr_1.31 haven_2.3.1